本次赛题为《阿里移动推荐算法》,以阿里巴巴移动电商平台的真实用户-商品行为数据为基础,同时提供移动时代特有的位置信息,而参赛队伍则需要通过大数据和算法构建面向移动电子商务的商品推荐模型。希望参赛队伍能够挖掘数据背后丰富的内涵,为移动用户在合适的时间、合适的地点精准推荐合适的内容。
在真实的业务场景下,我们往往需要对所有商品的一个子集构建个性化推荐模型。在完成这件任务的过程中,我们不仅需要利用用户在这个商品子集上的行为数据,往往还需要利用更丰富的用户行为数据。定义如下的符号:U——用户集合;I——商品全集;P——商品子集,P ⊆ I;D——用户对商品全集的行为数据集合。那么我们的目标是利用D来构造U中用户对P中商品的推荐模型。
[天池新人实战赛之[离线赛]][website].
# coding: utf-8
#
# Logistic-regression baseline over per-(user, item) behaviour counts.
#
# Pipeline: read a capped number of CSV rows, turn each row into an integer
# record, accumulate per-day cumulative behaviour counts, train on the
# day-29 records, score the day-30 records, write every pair scored >= 0.5
# to ans.csv and print precision / recall / F1.
#
# The code runs on both Python 2 and Python 3: every print call takes a
# single pre-formatted argument, and nothing relies on map()/zip()
# returning lists.
import time
import datetime
import itertools
import math

import numpy as np

# Input CSV; the first line is a header and is skipped.
DATA_PATH = "./tianchi_fresh_comp_train_user.csv"
# Output file holding the predicted (user_id, item_id) pairs.
ANSWER_PATH = 'ans.csv'
# Line cap including the header, i.e. at most controlsize - 1 data rows.
controlsize = 100000
# Day 0 of the day index stored in every record.
FIRST_DAY = datetime.datetime(2014, 11, 18)
# Records with a day index below this are folded into this snapshot day.
FIRST_SNAPSHOT_DAY = 28
# Last day index for which cumulative snapshots are materialised.
LAST_SNAPSHOT_DAY = 30
# behavior_type value that marks a purchase (count bucket 3).
BUY_BEHAVIOR = 4
# Minimum predicted probability for a pair to be emitted.
THRESHOLD = 0.5


def read_rows(path, limit):
    """Return the comma-split data rows of the CSV file at *path*.

    The header line is skipped and reading stops before line index
    *limit*, so at most ``limit - 1`` rows are returned. Blank lines are
    ignored. The file is closed before returning.
    """
    with open(path) as handle:
        return [
            line.split(",")
            for line in itertools.islice(handle, 1, limit)
            if line.strip()
        ]


def parse_record(fields):
    """Convert one split CSV row into an integer record.

    Returns ``(user_id, item_id, behavior_type, fields[4], day_index)``
    where ``day_index`` is the number of days between FIRST_DAY and the
    date in the first ten characters of the last field.
    ``fields[4]`` is presumably the item category -- TODO confirm against
    the dataset schema.
    """
    year, month, day_of_month = [
        int(part) for part in fields[-1][:10].split('-')
    ]
    diff_days = (datetime.datetime(year, month, day_of_month) - FIRST_DAY).days
    return (
        int(fields[0]),
        int(fields[1]),
        int(fields[2]),
        int(fields[4]),
        diff_days,
    )


def split_by_day(records):
    """Partition *records* by day index.

    Returns ``(train_data, train_data28, train_data29, train_data30, test)``:
    all records, those with day <= 28, day == 29, day == 30 and day > 30.
    The first four lists are de-duplicated (in set order); ``test`` is not.
    """
    train_data = []
    train_data28 = []
    train_data29 = []
    train_data30 = []
    test = []
    for record in records:
        diff_days = record[-1]
        train_data.append(record)
        if diff_days <= 28:
            train_data28.append(record)
        elif diff_days == 29:
            train_data29.append(record)
        elif diff_days == 30:
            train_data30.append(record)
        else:
            test.append(record)
    return (
        list(set(train_data)),
        list(set(train_data28)),
        list(set(train_data29)),
        list(set(train_data30)),
        test,
    )


def additem(uid, typeid, ui_dict, ui_buy):
    """Count one behaviour of bucket *typeid* for the key *uid*.

    Increments ``ui_dict[typeid][uid]`` and, for the purchase bucket (3),
    marks *uid* in *ui_buy*. Both containers are mutated in place and
    returned as ``(ui_dict, ui_buy)``.
    """
    ui_dict[typeid][uid] = ui_dict[typeid].get(uid, 0) + 1
    if typeid == 3:
        ui_buy[uid] = 1
    return ui_dict, ui_buy


def build_counts(train_data):
    """Accumulate cumulative behaviour counts per (user_id, item_id, day).

    Each record is counted on its own day (raised to FIRST_SNAPSHOT_DAY if
    earlier) and on every later day up to LAST_SNAPSHOT_DAY, so the count
    stored for a day covers everything seen up to and including that day.

    Returns ``(ui_dict, ui_buy)``: a list of four count dicts indexed by
    ``behavior_type - 1`` and a dict whose keys had at least one purchase.
    """
    ui_dict = [{} for _ in range(4)]
    ui_buy = {}
    for record in train_data:
        day = max(record[-1], FIRST_SNAPSHOT_DAY)
        typeid = record[2] - 1
        additem((record[0], record[1], day), typeid, ui_dict, ui_buy)
        for newday in range(day + 1, LAST_SNAPSHOT_DAY + 1):
            additem((record[0], record[1], newday), typeid, ui_dict, ui_buy)
    return ui_dict, ui_buy


def build_features(records, ui_dict):
    """Return a ``(len(records), 4)`` array of log1p cumulative counts.

    Row *i* holds, for each of the four behaviour buckets, the count stored
    for the record's (user_id, item_id) on the day BEFORE the record's day.
    """
    features = np.zeros((len(records), 4))
    for index, record in enumerate(records):
        key = (record[0], record[1], record[-1] - 1)
        for bucket in range(4):
            features[index][bucket] = math.log1p(ui_dict[bucket].get(key, 0))
    return features


def build_labels(records, ui_buy):
    """Return a 0/1 array: 1 where (user_id, item_id, day) is in *ui_buy*.

    NOTE(review): the keys of ui_buy are cumulative (build_counts copies a
    purchase forward to every later snapshot day), so a label of 1 means
    "bought on or before the record's day", and that same purchase is also
    counted in the previous day's features. Confirm this is intended.
    """
    labels = np.zeros((len(records), ))
    for index, record in enumerate(records):
        key = (record[0], record[1], record[-1])
        labels[index] = 1 if key in ui_buy else 0
    return labels


def train_and_predict(x, y, px):
    """Fit a LogisticRegression on (x, y); return P(class 1) per row of px."""
    # Imported here so the helpers above stay importable without
    # scikit-learn installed.
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression()
    model.fit(x, y)
    return [row[1] for row in model.predict_proba(px)]


def write_answers(lx, path=ANSWER_PATH, threshold=THRESHOLD):
    """Write the user_id,item_id of every pair scored >= *threshold*.

    *lx* must be sorted by score in descending order; writing stops at the
    first score below the threshold.
    """
    with open(path, 'w') as wf:
        wf.write('user_id,item_id\n')
        for record, score in lx:
            if score < threshold:
                break
            wf.write('%s,%s\n' % (record[0], record[1]))


def evaluate(lx, threshold=THRESHOLD):
    """Return ``(precision, recall, f1)`` in percent for scored records.

    *lx* is a sequence of ``(record, score)`` pairs. The prediction set is
    every pair scored >= *threshold*; the reference set is every pair whose
    record has behavior_type BUY_BEHAVIOR. A metric whose denominator is
    zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    size_predictionset = 0
    size_referenceset = 0
    size_predictionset_referenceset = 0
    for record, score in lx:
        predicted = score >= threshold
        bought = record[2] == BUY_BEHAVIOR
        if predicted:
            size_predictionset += 1
        if bought:
            size_referenceset += 1
        # Only records that are actually predicted count as hits; checking
        # the threshold first avoids counting a below-threshold record.
        if predicted and bought:
            size_predictionset_referenceset += 1

    P = 0.0
    if size_predictionset:
        P = 1.0 * size_predictionset_referenceset / size_predictionset * 100
    R = 0.0
    if size_referenceset:
        R = 1.0 * size_predictionset_referenceset / size_referenceset * 100
    F1 = 0.0
    if P + R:
        F1 = 2.0 * P * R / (P + R)
    return P, R, F1


def main():
    """Run the whole pipeline: read, count, train, predict, write, report."""
    # ############# reading data from files
    raw_list = read_rows(DATA_PATH, controlsize)
    records = [parse_record(fields) for fields in raw_list]
    # train_data28 and test are built but not used further down.
    train_data, _train_data28, train_data29, train_data30, _test = (
        split_by_day(records)
    )

    # ############# data pre-processing
    ui_dict, ui_buy = build_counts(train_data)
    print('%d %d' % (len(train_data), len(ui_dict)))
    print('%d %d %d %d' % tuple(len(bucket) for bucket in ui_dict))

    # Train on day-29 records, predict for day-30 records.
    x = build_features(train_data29, ui_dict)
    y = build_labels(train_data29, ui_buy)
    px = build_features(train_data30, ui_dict)
    print(x)
    print(y)
    print(px)

    # ############# training / predicting
    py = train_and_predict(x, y, px)

    # Combine and sort by predicted score, best first.
    lx = sorted(zip(train_data30, py), key=lambda pair: pair[1], reverse=True)
    write_answers(lx)

    # ############# evaluating
    P, R, F1 = evaluate(lx)
    print('precision: %.2f%%' % (P))
    print('recal: %.2f%%' % (R))
    print('F1: %.2f%%' % (F1))


if __name__ == "__main__":
    main()

# If you have any questions, contact: misterrxzh@gmail.com
xxxxxxxxxx